module Statsample
  # Multiset joins multiple dataset with the same fields and vectors
  # but with different number of cases. 
  # This is the base class for stratified and cluster sampling estimation
  class Multiset
    attr_reader :fields, :datasets
    # To create a multiset
    # * Multiset.new(%w{f1 f2 f3}) # define only fields
    def initialize(fields)
        @fields=fields
        @datasets={}
    end
    def self.new_empty_vectors(fields,ds_names) 
        ms=Multiset.new(fields)
        ds_names.each{|d|
            ms.add_dataset(d,Dataset.new(fields))
        }
        ms
    end
    def datasets_names
        @datasets.keys.sort
    end
    def n_datasets
        @datasets.size
    end
    def add_dataset(key,ds)
      if(ds.fields!=@fields)
       raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})"
      else
          @datasets[key]=ds
      end
    end
    def sum_field(field)
      @datasets.inject(0) {|a,da|
        stratum_name=da[0]
        vector=da[1][field]
        val=yield stratum_name,vector
        a+val
      }
    end
    def collect_vector(field)
      @datasets.collect {|k,v|
        yield k, v[field]
      }
    end
    def[](i)
      @datasets[i]
    end
  end
  class StratifiedSample
    class << self
      # mean for an array of vectors
      def mean(*vectors)
        n_total=0
        means=vectors.inject(0){|a,v|
          n_total+=v.size
          a+v.sum
        }
        means.to_f/n_total
      end
      
      def standard_error_ksd_wr(es)
        n_total=0
        sum=es.inject(0){|a,h|
            n_total+=h['N']
            a+((h['N']**2 * h['s']**2) / h['n'].to_f)
        }
        (1.to_f / n_total)*Math::sqrt(sum)
      end
      
      
      def variance_ksd_wr(es)
        standard_error_ksd_wr(es)**2
      end
      def calculate_n_total(es)
        es.inject(0) {|a,h| a+h['N'] }
      end
      # Source : Cochran (1972)
      
      def variance_ksd_wor(es)
      n_total=calculate_n_total(es)
      es.inject(0){|a,h|
        val=((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
        a+val
      }
      end
      def standard_error_ksd_wor(es)
        Math::sqrt(variance_ksd_wor(es))
      end
      
      
      
      def variance_esd_wor(es)
        n_total=calculate_n_total(es)
        sum=es.inject(0){|a,h|
          val=h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f)
          a+val
        }
        (1.0/(n_total**2))*sum
      end
      
      
      def standard_error_esd_wor(es)
        Math::sqrt(variance_ksd_wor(es))
      end
      # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
      def variance_esd_wr(es)
        n_total=calculate_n_total(es)
          sum=es.inject(0){|a,h|
            val= ((h['s']**2 * h['N']**2) / h['n'].to_f)
            a+val
          }
          (1.0/(n_total**2))*sum
      end
      def standard_error_esd_wr(es)
        Math::sqrt(variance_esd_wr(es))
      end
      
      def proportion_variance_ksd_wor(es)
        n_total=calculate_n_total(es)
          es.inject(0){|a,h|
            val= (((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N']))
            a+val
          }
      end
      def proportion_sd_ksd_wor(es)
          Math::sqrt(proportion_variance_ksd_wor(es))
      end
      
      
      def proportion_sd_ksd_wr(es)
        n_total=calculate_n_total(es)
        sum=es.inject(0){|a,h|
          val= (h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f
          a+val
        }
        Math::sqrt(sum) * (1.0/n_total)
      end
      def proportion_variance_ksd_wr(es)
          proportion_variance_ksd_wor(es)**2
      end
      
      def proportion_variance_esd_wor(es)
        n_total=n_total=calculate_n_total(es)
        
        sum=es.inject(0){|a,h|
          a=(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1))
          a+val
        }
        Math::sqrt(sum) * (1.0/n_total**2)
      end
      def proportion_sd_esd_wor(es)
          Math::sqrt(proportion_variance_ksd_wor(es))
      end
    end
    
    def initialize(ms,strata_sizes)
      raise TypeError,"ms should be a Multiset" unless ms.is_a? Statsample::Multiset
      @ms=ms
      raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
      @strata_sizes=strata_sizes
      @population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
      @strata_number=@ms.n_datasets
      @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
    end
    # Number of strata
    def strata_number
      @strata_number
    end
    # Population size. Equal to sum of strata sizes
    # Symbol: N<sub>h</sub>
    def population_size
      @population_size
    end
    # Sample size. Equal to sum of sample of each stratum
    def sample_size
      @sample_size
    end
    # Size of stratum x
    def stratum_size(h)
      @strata_sizes[h]
    end
    def vectors_by_field(field)
      @ms.datasets.collect{|k,ds|
        ds[field]
      }
    end
    # Population proportion based on strata
    def proportion(field, v=1)
      @ms.sum_field(field) {|s_name,vector|
      stratum_ponderation(s_name)*vector.proportion(v)
      }
    end
    # Stratum ponderation.
    # Symbol: W\<sub>h\</sub>
    def stratum_ponderation(h)
      @strata_sizes[h].to_f / @population_size
    end
    alias_method :wh, :stratum_ponderation
    
    # Population mean based on strata
    def mean(field)
      @ms.sum_field(field) {|s_name,vector|
      stratum_ponderation(s_name)*vector.mean
      }
    end
    # Standard error with estimated population variance and without replacement.
    # Source: Cochran (1972)
    def standard_error_wor(field)
      es=@ms.collect_vector(field) {|s_n, vector|
        {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
      }
      
      StratifiedSample.standard_error_esd_wor(es)
    end
    
    # Standard error with estimated population variance and without replacement.
    # Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
    
    def standard_error_wor_2(field)
      sum=@ms.sum_field(field) {|s_name,vector|
        s_size=@strata_sizes[s_name]
      (s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
      }
      (1/@population_size.to_f)*Math::sqrt(sum)
    end
    
    def standard_error_wr(field)
      es=@ms.collect_vector(field) {|s_n, vector|
        {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
      }
      
      StratifiedSample.standard_error_esd_wr(es)
    end
    def proportion_sd_esd_wor(field,v=1)
      es=@ms.collect_vector(field) {|s_n, vector|
        {'N'=>@strata_sizes[s_n],'n'=>vector.size, 'p'=>vector.proportion(v)}
      }
      
      StratifiedSample.proportion_sd_esd_wor(es)
    end
    
    def proportion_standard_error(field,v=1)
      prop=proportion(field,v)
      sum=@ms.sum_field(field) {|s_name,vector|
        nh=vector.size
        s_size=@strata_sizes[s_name]
        (s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
      }
      (1.quo(@population_size)) * Math::sqrt(sum)
    end
    # Cochran(1971), p. 150 
    def variance_pst(field,v=1)
      sum=@ms.datasets.inject(0) {|a,da|
        stratum_name=da[0]
        ds=da[1]
        nh=ds.cases.to_f
        s_size=@strata_sizes[stratum_name]
        prop=ds[field].proportion(v)
        a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1)))
      }
      (1/@population_size.to_f ** 2)*sum
    end
  end
end
